Build ENCODE ATAC/ChIP-Seq pipeline indices

1 Introduction

For more details, refer to ENCODE ATAC-Seq pipeline from GitHub, ENCODE ATAC-Seq pipeline from Google doc, ENCODE ChIP-Seq pipeline from GitHub, and ENCODE ChIP-Seq pipeline from Google doc.

2 Extract mitochondrial genome if present

pigz -k -c -d Rattus_norvegicus.mRatBN7.2.dna_sm.toplevel.fa.gz | faOneRecord stdin MT > Rattus_norvegicus.mRatBN7.2.dna_sm.toplevel.mito_only.fa

3 Prepare Bowtie2 index of species/mitochondrial genome

#!/usr/bin/bash

ref_fa=/data/biodatabase/species/sugar_glider_slb_v1/encode_references/bulk_atac_chip_seq/sugarglider.mito_only.fasta.gz
bowtie2_index_dir=/data/biodatabase/species/sugar_glider_slb_v1/encode_references/bulk_atac_chip_seq
bowtie2_index_prefix=mito
bowtie2_n_threads=60
tmp_dir=/data/tmp

cd ${bowtie2_index_dir}

if [ ${ref_fa##*.} == "gz" ]
then
    new_ref_fa=${tmp_dir}/$(basename ${ref_fa} .gz)
    pigz -k -c -d ${ref_fa} > ${new_ref_fa}
else
    new_ref_fa=${ref_fa}
fi

bowtie2-build --threads ${bowtie2_n_threads} -f ${new_ref_fa} ${bowtie2_index_prefix}

tar -cvf ${bowtie2_index_prefix}.bt2_index.tar *.bt2

rm *.bt2

if [ ${ref_fa##*.} == "gz" ]
then
    rm ${new_ref_fa}
fi

4 Prepare transcription start sites

Note: GFF3 is 1-based coordinate system, and BED is 0-based coordinate system.

library(rtracklayer)
library(vroom)
library(tidyverse)

gff_file <- "/data/biodatabase/species/mRatBN7/genome/anno/Rattus_norvegicus.mRatBN7.2.111.gff3.gz"
output_file <- "/data/biodatabase/species/mRatBN7/encode_references/bulk_atac_chip_seq/mRatBN7.tss.bed"
target <- "gene"

df <- import(gff_file, format = "gff3") %>%
    as.data.frame() %>%
    as_tibble()
df <- df %>%
    filter(type %in% target) %>%
    select(seqnames, start, end, strand) %>%
    mutate(
        tss_start = if_else(strand %in% c("+", ".", "*"), start - 1, end - 1),
        tss_end = if_else(strand %in% c("+", ".", "*"), start, end)
    ) %>%
    select(seqnames, tss_start, tss_end) %>%
    arrange(seqnames, tss_start, tss_end) %>%
    distinct()

vroom_write(df, file = output_file, col_names = FALSE, append = FALSE)
system2("/usr/bin/pigz", output_file)